Preparations

Load the necessary libraries

library(tidyverse) #for data wrangling

Link to the data transformation cheatsheet

https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf

Important data manipulation functions:

Task Function Package
Sorting arrange() dplyr
Adding columns mutate() dplyr
Transformations mutate() dplyr
Re-ordering factor levels factor(,levels=) base
Re-labelling factor(,lab=) base
recode() dplyr
Re-naming columns rename(,replace=) dplyr
Filtering/Subsetting indexing base
~ columns select(,...) dplyr
pull(,...) dplyr
~ rows filter(,...) dplyr
Unique combinations distinct() dplyr
Reshaping data pivot_longer(), pivot_wider() tidyr
Split/combine columns separate(), unite() tidyr
Aggregating group_by() summarise() dplyr
group_by() count() dplyr
Merging/joining *_join() dplyr
Extracting data structure expand() tidyr
crossing() tidyr

Piping

Data files

load(file='../data/manipulationDatasets.RData')
dat.1 %>% head

Sorting data

# Arrange by the Treatment, then Time
dat.1 %>% 
  arrange(Treatment, Time)
# Arrange by Treatment, then by the mean of Resp 1 and Resp2
dat.1 %>% 
  mutate(Resp = mean(c(Resp1, Resp2))) %>%
  arrange(Treatment, Resp)

Adding columns - mutate

dat.1 %>%
  mutate(
    Dose = fct_relevel(Dose, c('L', 'M', 'H')),
    Dose = fct_recode(Dose, High = "H", Medium = "M")) %>%
  as_tibble %>%
  pull(Dose)
##  [1] High   High   High   High   Medium Medium Medium Medium L      L     
## [11] L      L      High   High   High   High   Medium Medium Medium Medium
## [21] L      L      L      L      High   High   High   High   Medium Medium
## [31] Medium Medium L      L      L      L      High   High   High   High  
## [41] Medium Medium Medium Medium L      L      L      L     
## Levels: L Medium High
dat.1 %>% 
  mutate(leadResp2 = lead(Resp2), 
         lagResp1 = lag(Resp1))
dat.1 %>%
  mutate(rank_time = min_rank(Time),
         dens_rank_time = dense_rank(Time))
dat.1 %>%
  mutate(row_Resp1 = row_number(Resp1), 
         row_time = row_number(Time),
         rank_time = min_rank(Time),
         ntile_Resp1 = ntile(Resp1, 4),
         medium_Resp1 = between(Resp1, 20, 40),
         fResp1A = case_when(Resp1 < 31 ~ "Low",
                 between(Resp1, 31, 50) ~ "Medium",
                 Resp1 > 50 ~ "High"),
         fResp1B = cut(Resp1, breaks = c(0, 31, 50, 200),
                              labels = c("Low", "Medium", "High")),
         fResp1C = cut(Resp1, breaks = 2,
                              labels = c("Low", "High")))

Summarising (aggregating) data

SE <- function(x) sd(x) / sqrt(length(x))
dat.1 %>% 
  summarise(MeanResp1 = mean(Resp1), VarResp1 = var(Resp1),
            SEM = SE(Resp1))
# Using anonymous function:
dat.1 %>% summarise(MeanResp1 = mean(Resp1), VarResp1 = var(Resp1),
          SEM = (function(x) sd(x) / sqrt(length(x)))(Resp1))
# Summarize across a number of variables
dat.1 %>% summarise(across(c(Resp1, Resp2), 
                           list(Mean = mean, Var = var)))
# Get mean and var
dat.1 %>% summarise(across(where(is.numeric), 
                           list(Mean = mean, Var = var)))
# Get mean and length
dat.1 %>% summarize(across(where(is.numeric),  mean),
          across(where(is.factor),  length))
Variable <- c("Resp1", "Resp2")
dat.1 %>% summarise(across(all_of(Variable), list(Mean = mean, Var = var)))

Grouping (=aggregating)

dat.1 %>%
  arrange(Resp1) %>%
    group_by(Treatment, Plot) %>%
    summarise(Mean = mean(Resp1),
              Var = var(Resp1),
              N = n(),
              First = first(Resp1))
## `summarise()` regrouping output by 'Treatment' (override with `.groups` argument)
tikus[1:10, c(1:3, 76:77)]
tikus %>% 
  # head(10) %>%
  # select("Psammocora contigua", "Psammocora digitata", "Pocillopora damicornis", "time", "rep") %>%
  arrange(`Pocillopora damicornis`) %>%
  group_by(time) %>%
  summarise(across(all_of(c("Psammocora contigua", "Psammocora digitata", "Pocillopora damicornis")), list(Mean = mean))) %>%
  ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)

Subset columns

dat.1 %>% dplyr::select(-starts_with(c("T","D")))
dat.1 %>% dplyr::select(contains("e"))
dat.1 %>% dplyr::select(where(is_numeric))
## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated

## Warning: Deprecated
dat.1 %>% select(matches("^.{4}$")) # regex for get only columns with 4-characters

Regular expressions (regex)

nasa %>% 
  select(lat, long, starts_with("cloud"))
nasa %>% 
  select(lat, long, matches("^cloud.*$")) %>% head
# Select rep, time, but not any with pora
tikus %>% select(rep, time, !contains("pora")) %>% 
  ncol()
## [1] 40
# Same thing, but older way of doing it
tikus %>% select(rep, time, everything(), -contains("pora")) %>%
  ncol()
## [1] 40
dat.1 %>% rename(Exposure = Treatment, Richness = Resp1) %>%
  relocate(Richness, .before=Plot)

https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf

Filtering

dat.1 %>% filter(Dose == "H" & Resp1 < 25) %>% nrow
## [1] 6
dat.1 %>% filter(Dose == "H" | Resp1 < 25) %>% nrow
## [1] 18

Be careful about not being ambiguous!

dat.1 %>%
  filter(Resp1 < 40 & (Time > 1 | Dose == "L")) %>% nrow
## [1] 10
dat.1 %>%
  filter((Resp1 < 40 & Time > 1) | Dose == "L") %>% nrow
## [1] 24
nasa %>% 
  dplyr::filter(month == 2, year == max(year)) %>%
  filter(ozone == max(ozone))
nasa %>% 
  dplyr::filter(month == 2 & year == max(year)) %>%
  arrange(-ozone) %>% slice(1)
nasa %>% filter(month == 1, year == max(year)) %>%
  filter(ozone > 320 & ozone < 325)
nasa %>% 
  filter(month == 1, year == max(year), between(ozone, 320, 325))

Always make sure to drop ‘ghost’ levels

dat.1 %>%
  filter(Plot == "P1") %>%
  pull(Plot) %>%
  levels
## [1] "P1" "P2" "P3" "P4"
dat.1 %>%
  filter(Plot == "P1") %>%
  pull(Plot) %>%
  droplevels()
##  [1] P1 P1 P1 P1 P1 P1 P1 P1 P1 P1 P1 P1
## Levels: P1

Reshaping data

Pivot longer

data.w %>%
  pivot_longer(cols = starts_with("Time"),
               names_to = "time", values_to = "min") %>%
  mutate(time = as.numeric(str_extract(time, "(?<=Time.).")))
# easier way:
data.w %>%
  pivot_longer(cols = starts_with("Time"),
               names_to = "time", values_to = "min",
               names_prefix = "Time.") %>% # makes it way easier!!
  mutate(time = as.numeric(time))

Pivot wider

data %>%
  pivot_wider(names_from = Within, values_from = c(Resp1, Resp2))

Combining data

data.bio <- data.bio %>% relocate(Resp1, Resp2, .after=Subplot)

data.bio %>% full_join(data.chem) %>%
  arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% left_join(data.chem) %>%
  arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% right_join(data.chem) %>%
  arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% inner_join(data.chem) %>%
  arrange(Plot, Subplot)
## Joining, by = c("Between", "Plot", "Subplot")
data.bio %>% rbind(data.bio[6:9,]) %>%
  left_join(data.chem) %>%
  arrange(Plot, Subplot) # same as vlookup!
## Joining, by = c("Between", "Plot", "Subplot")

Applied examples

tikus %>%
  rename(`Acropora aspera` = `Acropera aspera`) %>%
  select(time, rep, starts_with("Acropora")) %>%
  pivot_longer(cols = c(-rep, -time), names_to = "species", values_to = "abundance") %>%
  mutate(cover = abundance / (10*100),
         per_cover = cover * 100) %>%
  group_by(time, rep) %>%
  summarise(total_acropora_cover = sum(cover)) %>%
  ungroup() %>%
  group_by(time) %>% # averaging across all sites now
  summarise(mean_yearly_acropora_cover = mean(total_acropora_cover),
            sd_yearly_acropora_cover = sd(total_acropora_cover)) %>%
  ungroup() %>%
  mutate(year = 1900 + as.numeric(as.character(time)))
## `summarise()` regrouping output by 'time' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
summary_tikus <- 
  tikus %>%
  rename(`Acropora aspera` = `Acropera aspera`) %>%
  # select(time, rep, starts_with("Acropora")) %>%
  pivot_longer(cols = c(-rep, -time), names_to = "species", values_to = "abundance") %>%
  mutate(cover = abundance / (10*100),
         per_cover = cover * 100) %>%
  separate(col = species, c("genus", "species")) %>%
  group_by(genus, time, rep) %>%
  summarise(total_cover = sum(cover)) %>%
  ungroup() %>%
  mutate(genus = fct_lump_n(genus, 5, total_cover)) %>%
  group_by(genus, time) %>% # averaging across all sites now
  summarise(mean_yearly_cover = mean(total_cover),
            sd_yearly_cover = sd(total_cover)) %>%
  ungroup() %>%
  mutate(year = 1900 + as.numeric(as.character(time)))
## `summarise()` regrouping output by 'genus', 'time' (override with `.groups` argument)
## `summarise()` regrouping output by 'genus' (override with `.groups` argument)
summary_tikus
theme_set(theme_classic())
summary_tikus %>%
   ggplot(aes(x = year, y = mean_yearly_cover, group = genus)) +
  facet_wrap(~genus) +
   geom_line(linetype = "dashed") +
  geom_area(aes(fill = genus), alpha = 0.5) +
   labs(x = "Year", y = "Mean Acropora Cover") +
   scale_y_continuous(labels = scales::label_percent(),
                      expand = expansion()) +
  scale_x_continuous(expand = expansion())